5  Model

Code
# train_model.py
import os, json, inspect
from pathlib import Path
from collections import defaultdict

import numpy as np
import pandas as pd
import networkx as nx
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import StackingClassifier
from sklearn.metrics import roc_auc_score, accuracy_score, brier_score_loss
from xgboost import XGBClassifier
import joblib

# ---------- config ----------
# Project root. Defaults to the original checkout location; set the
# STAT468_BASE_DIR environment variable to run from any other machine/path.
BASE_DIR  = Path(
    os.getenv("STAT468_BASE_DIR", "/Users/yifanw124/STAT468/stat468-final-project")
)
DATA_PATH = BASE_DIR / "tournaments_2018_2025_June.csv"
OUT_DIR   = BASE_DIR
OUT_MODEL = OUT_DIR / "stack_model.joblib"
OUT_META  = OUT_DIR / "feature_spec.json"

# Boolean switches: True only when the variable is set to "true"
# (case-insensitive); unset or any other value means False.
PIN_TO_S3          = os.getenv("PIN_TO_S3", "false").lower() == "true"
USE_VETIVER_BUNDLE = os.getenv("USE_VETIVER", "false").lower() == "true"
RANDOM_STATE       = 42

# String settings read from the environment, with the defaults shown.
MODEL_BUCKET = os.getenv("MODEL_BUCKET", "")
MODEL_PIN    = os.getenv("MODEL_PIN", "stack_model")

# ---------- load ----------
df0 = pd.read_csv(DATA_PATH)

# Keep only rows whose event label contains "MS" or "WS" (presumably the
# singles draws -- confirm against the data dictionary). na=False treats a
# missing event as "no match"; without it the mask contains NaN and boolean
# indexing raises.
is_selected_event = df0["event"].str.contains("MS|WS", regex=True, na=False)
df0 = df0[is_selected_event].copy()

df0["date"] = pd.to_datetime(df0["date"])

# Stable sort: matches sharing a date keep their original file order, so the
# order-dependent Elo pass that follows is deterministic.
df0 = df0.sort_values("date", kind="mergesort").reset_index(drop=True)

# ---------- Elo (online, no leakage) ----------
DEFAULT_ELO = 1200  # rating assigned to a player on first lookup
K = 32              # step size applied to (outcome - expected) in update_elo
elo = defaultdict(lambda: DEFAULT_ELO)


def expected_score(rA, rB):
    """Return the expected score of A against B under the Elo logistic curve.

    rA, rB: current ratings of players A and B.
    Equal ratings give 0.5; a 400-point lead for A gives 10:1 odds.
    """
    exponent = (rB - rA) / 400
    return 1 / (1 + 10 ** exponent)


def update_elo(rA, rB, outcome_A):
    """Return the post-match ratings ``(rA_new, rB_new)``.

    rA, rB: pre-match ratings.
    outcome_A: 1 if A won, 0 if A lost.
    B's outcome and expectation are the complements of A's.
    """
    expected_a = expected_score(rA, rB)
    outcome_b = 1 - outcome_A
    expected_b = 1 - expected_a
    return (
        rA + K * (outcome_A - expected_a),
        rB + K * (outcome_b - expected_b),
    )

def _perspective_row(match, player, opponent, elo_player, elo_opponent,
                     score_diff, win):
    """Build one long-format row describing `match` from `player`'s side.

    All Elo values passed in are the PRE-match ratings, so the row carries no
    information derived from the match's own result except `win` and
    `score_diff`.
    """
    return {
        "player_id": player, "opponent_id": opponent,
        "elo_player": elo_player, "elo_opponent": elo_opponent,
        "elo_diff": elo_player - elo_opponent,
        "score_diff": score_diff,
        "win": win,
        "date": match["date"],
        "tournament": match.get("tournament_name", None),
        "event": match["event"],
    }


# Walk the matches in order, emitting two mirrored rows per match (one per
# player) and only then updating the running Elo table.
rows = []
for _, r in df0.iterrows():
    p1, p2 = str(r["player1"]), str(r["player2"])
    out1 = 1 if int(r["winner"]) == 1 else 0
    r1, r2 = elo[p1], elo[p2]
    sd = float(r["team1_total_points"] - r["team2_total_points"])

    # features BEFORE updating Elo to avoid leakage
    rows.append(_perspective_row(r, p1, p2, r1, r2, sd, out1))
    rows.append(_perspective_row(r, p2, p1, r2, r1, -sd, 1 - out1))

    elo[p1], elo[p2] = update_elo(r1, r2, out1)

# Stable sort: rows that share a date keep the order in which they were
# generated. The shifted rolling / EWM features computed later depend on row
# order within each player, so an unstable sort could scramble same-day
# matches.
df = (
    pd.DataFrame(rows)
      .sort_values("date", kind="mergesort")
      .reset_index(drop=True)
)

def _prior_rolling_mean(wins, window):
    """Mean of up to `window` results preceding each row (current row excluded)."""
    previous = wins.shift(1)
    return previous.rolling(window, min_periods=1).mean()


def _prior_ewm_mean(wins, smoothing):
    """Exponentially weighted mean of the results preceding each row."""
    previous = wins.shift(1)
    return previous.ewm(alpha=smoothing, adjust=False).mean()


# ---------- Rolling win% (shifted) ----------
# One column per window size; shifting by one keeps the current match's
# result out of its own feature.
for w in (5, 10, 20):
    wins_by_player = df.groupby("player_id")["win"]
    df[f"win_pct_{w}"] = wins_by_player.transform(
        lambda s: _prior_rolling_mean(s, w)
    )

# ---------- H2H exponential decay (shifted) ----------
alpha = 0.1
wins_by_pairing = df.groupby(["player_id", "opponent_id"])["win"]
df["h2h_decay"] = wins_by_pairing.transform(lambda s: _prior_ewm_mean(s, alpha))

# Opponent strength adjust (safe divide)
# A zero player rating becomes NaN so the division cannot blow up; any NaN in
# the product (including rows with no prior head-to-head) is then set to 0.0.
player_elo_nonzero = df["elo_player"].replace(0, np.nan)
opponent_to_player_ratio = df["elo_opponent"] / player_elo_nonzero
df["h2h_adj"] = (df["h2h_decay"] * opponent_to_player_ratio).fillna(0.0)

# ---------- Time-based split ----------
# Rows dated at or before the 80th-percentile date form the training period;
# strictly later rows form the test period.
date_cut = df["date"].quantile(0.80)
in_train = df["date"] <= date_cut
in_test = df["date"] > date_cut

# ---------- PageRank (train period only) ----------
# One edge per training-period win, pointing from the loser (opponent_id) to
# the winner (player_id). Edges are inserted in row order.
G = nx.DiGraph()
train_wins = df.loc[in_train & (df["win"] == 1), ["opponent_id", "player_id"]]
G.add_edges_from(zip(train_wins["opponent_id"], train_wins["player_id"]))
pagerank = nx.pagerank(G, alpha=0.85) if G.number_of_nodes() > 0 else {}

# Players absent from the training graph get a score of 0.0.
# NOTE(review): the scores are static over the whole training period, so each
# training row's pr_* values come from a graph that already contains that
# row's own outcome (and later training matches). Test rows are unaffected.
df["pr_player"]   = df["player_id"].map(lambda x: pagerank.get(x, 0.0)).astype(float)
df["pr_opponent"] = df["opponent_id"].map(lambda x: pagerank.get(x, 0.0)).astype(float)

# Materialise the partitions once, after the PageRank columns exist.
df_tr = df[in_train].copy()
df_te = df[in_test].copy()

5.1 Temporal Train-Test Split

The dataset was split into training and testing partitions using the 80th percentile of the date distribution as a cutoff. Elo ratings, rolling win percentages, and head-to-head features are computed online, using only matches that precede the one being described, while PageRank was computed using only training-period matches and then mapped onto both partitions. This design enforces a realistic, forward-in-time prediction setting that mirrors actual deployment conditions, ensuring that the evaluation reflects true out-of-sample performance.

Code
# ---------- Features / target ----------
# Model inputs, in the column order used for both partitions.
FEATURES = [
    "elo_diff",
    "win_pct_5", "win_pct_10", "win_pct_20",
    "h2h_decay", "h2h_adj",
    "pr_player", "pr_opponent",
]


def _numeric_or_zero(values):
    """Coerce to numeric; unparseable and missing entries become 0.0."""
    return pd.to_numeric(values, errors="coerce").fillna(0.0)


# Clean every feature column in place on both partitions.
for partition in (df_tr, df_te):
    for c in FEATURES:
        partition[c] = _numeric_or_zero(partition[c])

# Design matrices and integer 0/1 targets.
X_train = df_tr[FEATURES]
y_train = df_tr["win"].astype(int)
X_test = df_te[FEATURES]
y_test = df_te["win"].astype(int)

5.2 XGBoost

The modelling stage begins by specifying a tuned gradient boosting model. An XGBClassifier is configured with parameters selected to balance predictive power with generalization, including a moderate tree depth, a learning rate that encourages gradual updates, and column and row subsampling to introduce diversity in the fitted trees. The chosen evaluation metric, log loss, reflects a focus on producing well-calibrated probabilities rather than simply maximizing classification accuracy. A fixed random seed is set to ensure results are reproducible. Categorical features are explicitly disabled within the model to align with the preprocessing pipeline used earlier in the workflow.

Code
# ---------- Model ----------
# Hyperparameters for the gradient-boosted base learner. The seed comes from
# RANDOM_STATE so repeated runs build the same trees.
best_xgb_params = dict(
    n_estimators=214,
    max_depth=8,
    learning_rate=0.05801866004578234,
    subsample=0.80,          # fraction of rows sampled per tree
    colsample_bytree=0.75,   # fraction of columns sampled per tree
    eval_metric="logloss",
    random_state=RANDOM_STATE,
    enable_categorical=False,
)
xgb = XGBClassifier(**best_xgb_params)

5.3 Stacking XGBoost and Logistic Regression

Rather than relying on a single model, the strategy combines two models through a stacking ensemble. Here, a logistic regression and the tuned XGBoost model serve as base learners. Each has different inductive biases: the linear model captures straightforward additive relationships, while the boosted trees capture non-linear interactions and thresholds. The outputs of these base models, along with the original features (passthrough=True), are fed into a final logistic regression meta-model, which learns how to weight and combine the different perspectives.

Code
# Base learners: a logistic regression and the XGBoost classifier `xgb`.
base_logreg = LogisticRegression(max_iter=1_000, random_state=RANDOM_STATE)
estimators = [("lr", base_logreg), ("xgb", xgb)]

# Meta-learner: with passthrough=True it sees the original features alongside
# the base learners' cross-validated predictions.
meta_logreg = LogisticRegression(max_iter=1_000, random_state=RANDOM_STATE)
stack = StackingClassifier(
    estimators=estimators,
    final_estimator=meta_logreg,
    cv=5,
    passthrough=True,
    n_jobs=-1,
)

stack.fit(X_train, y_train)

# Second predict_proba column is taken as P(win == 1); threshold at 0.5.
test_probabilities = stack.predict_proba(X_test)
y_prob = test_probabilities[:, 1]
y_pred = (y_prob >= 0.5).astype(int)

5.4 Results

The fitted stacking classifier is then applied to the held-out test set to produce predicted probabilities. Performance is evaluated using three complementary metrics under a temporal train–test split, which reflects real-world deployment where future matches are predicted from past data. The ROC AUC score measures the model’s ability to rank winners above losers, accuracy captures the proportion of correct classifications at a 0.5 probability threshold, and the Brier score is the mean squared error of the predicted probabilities, so it rewards good calibration and penalizes overconfident errors.

Code
# Held-out metrics: ranking quality, hard-label accuracy, probability error.
test_auc = roc_auc_score(y_test, y_prob)
print(f"[Temporal split] ROC AUC : {test_auc:.6f}")
test_accuracy = accuracy_score(y_test, y_pred)
print(f"[Temporal split] Accuracy: {test_accuracy:.6f}")
test_brier = brier_score_loss(y_test, y_prob)
print(f"[Temporal split] Brier   : {test_brier:.6f}")
[Temporal split] ROC AUC : 0.757130
[Temporal split] Accuracy: 0.689982
[Temporal split] Brier   : 0.208891